Dictionary Usage¶
Dictionary Usage as a Lookup Table¶
sine = [-0.0000, -0.0495, -0.0988, -0.1479,
-0.1966, -0.2449, -0.2925, -0.3394,
-0.3855, -0.4307, -0.4748, -0.5177,
-0.5594, -0.5997, -0.6386, -0.6758,
-0.7115, -0.7453, -0.7774, -0.8076,
-0.8357, -0.8619, -0.8859, -0.9078,
-0.9274, -0.9448, -0.9598, -0.9725,
-0.9828, -0.9908, -0.9963, -0.9993,
-0.9999, -0.9981, -0.9938, -0.9871,
-0.9780, -0.9665, -0.9526, -0.9364,
-0.9179, -0.8971, -0.8742, -0.8491,
-0.8219, -0.7927, -0.7616, -0.7286,
-0.6939, -0.6574, -0.6193, -0.5798,
-0.5387, -0.4964, -0.4529, -0.4082,
-0.3626, -0.3161, -0.2688, -0.2208,
-0.1723, -0.1234, -0.0741, -0.0247,
0.0247, 0.0741, 0.1234, 0.1723,
0.2208, 0.2688, 0.3161, 0.3626,
0.4082, 0.4529, 0.4964, 0.5387,
0.5798, 0.6193, 0.6574, 0.6939,
0.7286, 0.7616, 0.7927, 0.8219,
0.8491, 0.8742, 0.8971, 0.9179,
0.9364, 0.9526, 0.9665, 0.9780,
0.9871, 0.9938, 0.9981, 0.9999,
0.9993, 0.9963, 0.9908, 0.9828,
0.9725, 0.9598, 0.9448, 0.9274,
0.9078, 0.8859, 0.8619, 0.8357,
0.8076, 0.7774, 0.7453, 0.7115,
0.6758, 0.6386, 0.5997, 0.5594,
0.5177, 0.4748, 0.4307, 0.3855,
0.3394, 0.2925, 0.2449, 0.1966,
0.1479, 0.0988, 0.0495, 0.0000,]
from math import pi
angle = 0 # offset in the table
angle_index = int((pi + angle) / (2 * pi) * (len(sine) - 1))
print(angle_index) # 63
sine[angle_index] # -0.0247 - bad accuracy, should be 0.0
from math import sin
sin(angle) # 0.0
sine[angle_index-1], sine[angle_index], sine[angle_index+1] # (-0.0741, -0.0247, 0.0247)
# Create a table of sine values
table = {round(2*pi*i/(len(sine) -1) - pi, 4): sine[i] for i in range(len(sine))}
table = {-3.1416: -0.0000, -3.0921: -0.0495, -3.0426: -0.0988, -2.9932: -0.1479,
-2.9437: -0.1966, -2.8942: -0.2449, -2.8447: -0.2925, -2.7953: -0.3394,
-2.7458: -0.3855, -2.6963: -0.4307, -2.6469: -0.4748, -2.5974: -0.5177,
-2.5479: -0.5594, -2.4984: -0.5997, -2.4490: -0.6386, -2.3995: -0.6758,
-2.3500: -0.7115, -2.3005: -0.7453, -2.2511: -0.7774, -2.2016: -0.8076,
-2.1521: -0.8357, -2.1026: -0.8619, -2.0532: -0.8859, -2.0037: -0.9078,
-1.9542: -0.9274, -1.9047: -0.9448, -1.8553: -0.9598, -1.8058: -0.9725,
-1.7563: -0.9828, -1.7068: -0.9908, -1.6574: -0.9963, -1.6079: -0.9993,
-1.5584: -0.9999, -1.5090: -0.9981, -1.4595: -0.9938, -1.4100: -0.9871,
-1.3605: -0.9780, -1.3111: -0.9665, -1.2616: -0.9526, -1.2121: -0.9364,
-1.1626: -0.9179, -1.1132: -0.8971, -1.0637: -0.8742, -1.0142: -0.8491,
-0.9647: -0.8219, -0.9153: -0.7927, -0.8658: -0.7616, -0.8163: -0.7286,
-0.7668: -0.6939, -0.7174: -0.6574, -0.6679: -0.6193, -0.6184: -0.5798,
-0.5689: -0.5387, -0.5195: -0.4964, -0.4700: -0.4529, -0.4205: -0.4082,
-0.3711: -0.3626, -0.3216: -0.3161, -0.2721: -0.2688, -0.2226: -0.2208,
-0.1732: -0.1723, -0.1237: -0.1234, -0.0742: -0.0741, -0.0247: -0.0247,
0.0247: 0.0247, 0.0742: 0.0741, 0.1237: 0.1234, 0.1732: 0.1723,
0.2226: 0.2208, 0.2721: 0.2688, 0.3216: 0.3161, 0.3711: 0.3626,
0.4205: 0.4082, 0.4700: 0.4529, 0.5195: 0.4964, 0.5689: 0.5387,
0.6184: 0.5798, 0.6679: 0.6193, 0.7174: 0.6574, 0.7668: 0.6939,
0.8163: 0.7286, 0.8658: 0.7616, 0.9153: 0.7927, 0.9647: 0.8219,
1.0142: 0.8491, 1.0637: 0.8742, 1.1132: 0.8971, 1.1626: 0.9179,
1.2121: 0.9364, 1.2616: 0.9526, 1.3111: 0.9665, 1.3605: 0.9780,
1.4100: 0.9871, 1.4595: 0.9938, 1.5090: 0.9981, 1.5584: 0.9999,
1.6079: 0.9993, 1.6574: 0.9963, 1.7068: 0.9908, 1.7563: 0.9828,
1.8058: 0.9725, 1.8553: 0.9598, 1.9047: 0.9448, 1.9542: 0.9274,
2.0037: 0.9078, 2.0532: 0.8859, 2.1026: 0.8619, 2.1521: 0.8357,
2.2016: 0.8076, 2.2511: 0.7774, 2.3005: 0.7453, 2.3500: 0.7115,
2.3995: 0.6758, 2.4490: 0.6386, 2.4984: 0.5997, 2.5479: 0.5594,
2.5974: 0.5177, 2.6469: 0.4748, 2.6963: 0.4307, 2.7458: 0.3855,
2.7953: 0.3394, 2.8447: 0.2925, 2.8942: 0.2449, 2.9437: 0.1966,
2.9932: 0.1479, 3.0426: 0.0988, 3.0921: 0.0495, 3.1416: 0.0000}
Use interpolation for values missed in the table
from bisect import bisect
# from __future__ import division
class interpolating_dict(dict):
def __missing__(self, key):
sorted_keys = sorted(self.keys())
index = bisect(sorted_keys, key)
if index == 0 or index == len(sorted_keys):
raise KeyError('cannot extrapolate value {}'.format(key))
left_key, right_key = sorted_keys[index-1], sorted_keys[index]
left_val, right_val = self[left_key], self[right_key]
slope = (right_val - left_val) / (right_key - left_key)
self[key] = value = slope * (key - left_key) + left_val
return value
# New sine object
sine = interpolating_dict(table)
print(sine[-3.1416]) # -0.0 - OK
print(sine[0]) # 0.0 - new interpolated value, saved in the table
print(sine[pi/2]) # 0.9997497414933952 - interpolated and saved in the table
print(sine[pi/4]) # 0.7069375004018476 - interpolated and saved in the table
print(sine)
# {-3.1416: -0.0,
# -3.0921: -0.0495,
# -3.0426: -0.0988,
# . . .
# 3.0426: 0.0988,
# 3.0921: 0.0495,
# 3.1416: 0.0,
# 0: 0.0,
# 1.5707963267948966: 0.9997497414933952,
# 0.7853981633974483: 0.7069375004018476}
Dictionary usage as a Relation or a Function¶
from math import sqrt
def roots(a,b,c):
return (-b + sqrt(b**2 - 4*a*c))/(2*a)
Test:
print(roots(1,2,1)) # (x+1)(x+1) == x? + 2x + 1 # -1.0
class rootdict(dict): # ???
def __missing__(self, key):
a, b, c = key
return (-b + sqrt(b**2 - 4*a*c))/(2*a)
Test:
d = rootdict()
print(d[1,2,1]) # -1.0
print(d[2,4,2]) # -1.0
from sys import version_info
assert version_info.major == 3 and version_info.minor >= 3, \
'requires PEP 362; Python 3.3 or later; python.org/dev/peps/pep-0362/'
from inspect import signature
class memoise(dict):
def __init__(self, func):
self.func, self.signature = func, signature(func)
def __missing__(self, key):
args, kwargs = key
self[key] = self.func(*args, **dict(kwargs))
return self[key]
def __call__(self, *args, **kwargs):
key = self.signature.bind(*args, **kwargs)
return self[key.args, frozenset(key.kwargs.items())]
from time import sleep
@memoise # use decorator
def roots(a,b,c):
sleep(1)
return (-b + sqrt(b**2 - 4*a*c))/(2*a)
%time roots(1,2,1) # Wall time: 1 s; -1.0
%time roots(1,2,1) # Wall time: 0 ns; -1.0 <== 0 time for known result
roots(2,4,2) # -1.0
print(roots) # {((1, 2, 1), frozenset()): -1.0, ((2, 4, 2), frozenset()): -1.0}
# it's a dictionary
# for a given arguments the results are saved !!!
# and we don't make computation again
Protocols: __getitem__ and __call__¶
d = {1: 1, 2: 4}
print(d[2]) # 4
print(d.__getitem__(2)) # 4
class Foo:
def __getitem__(self, key):
return key**2
foo = Foo()
print(foo[13]) # [ key ] -> __getitem__ => 169 value
class Foo:
def __call__(self, val):
return val**2
foo = Foo()
print(foo(10)) # ( val ) -> __call__ => 100
dict.__missing__¶
from collections import Iterable
class rangedict(dict):
def __missing__(self, key):
for k, v in self.items():
if isinstance(k, Iterable):
left, right = k
if left <= key < right:
self[key] = v
return v
raise KeyError('cannot find {} in rangedict'.format(key))
codes = rangedict({( 0, 10): 'red',
( 10, 100): 'yellow',
(100, 1000): 'green', })
print(codes[105]) # 'green'
print(codes) # {(0, 10): 'red', (10, 100): 'yellow', (100, 1000): 'green', 105: 'green'}
class passthrudict(dict):
def __missing__(self, key):
return key
censor = passthrudict({'hell': 'h***',
'darn': 'd*rn',})
sentence = "That darn cat!"
print(' '.join(censor[w] for w in sentence.split())) # 'That d*rn cat!'
sentence = "Y'all can go to hell ; I'm going to Texas!"
print(' '.join(censor[w] for w in sentence.split())) # "Y'all can go to hell; I'm going to Texas!" - no split!!!
# Equivalent to .get() but syntax is easier.
# Fix split issue
from itertools import groupby, chain
from unicodedata import category
def fancy_split(s):
''' Take string, group by unicode category (L - letters) '''
return [''.join(g) for k, g in groupby(s, key=lambda x: category(x)[0] == 'L')]
Test:
sentence = "Y'all can go to hell; I'm going to Texas!"
print(fancy_split(sentence))
Output:
['Y',
"'",
'all',
' ',
'can',
' ',
'go',
' ',
'to',
' ',
'hell',
'; ',
'I',
"'",
'm',
' ',
'going',
' ',
'to',
' ',
'Texas',
'!']
print(''.join(censor[w] for w in fancy_split(sentence)))
Output:
"Y'all can go to h***; I'm going to Texas!"
collections.defaultdict¶
student_courses = {'vinnie': {'calculus', 'diff eq'},
'arnold': {'calculus', 'linear algebra'},
'juan luis': {'real analysis'}}
# student_courses['freddie'].add('linear algebra') # Error
student_courses['freddie'] = set()
student_courses['freddie'].add('linear algebra')
Use helper called defaultdict¶
from collections import defaultdict
student_courses = defaultdict(set)
student_courses.update({'vinnie': {'calculus', 'diff eq'},
'arnold': {'calculus', 'linear algebra'},
'juan luis': {'real analysis'},
}
)
print(student_courses['freddy']) # set()
student_courses['freddie'].add('linear algebra') # create set and add to the set
print(student_courses)
Output:
defaultdict(set,
{'vinnie': {'calculus', 'diff eq'},
'arnold': {'calculus', 'linear algebra'},
'juan luis': {'real analysis'},
'freddie': {'linear algebra'}}
but we can not check if element does exist!
We always have a result
student_courses['foobar'] # set(), always
'foobar' in student_courses # False
Use Dictionary as a Hash¶
Dict type implemented as Hash table
english = ['one', 'two', 'three']
spanish = ['uno', 'dos', 'tres']
print(english.index('two')) # 1
print(spanish[english.index('three')]) # 'tres' => slow
from bisect import bisect_left
lookup = [en for en, es in sorted(zip(english, spanish))]
value = [es for en, es in sorted(zip(english, spanish))]
key = 'one'
print(value[bisect_left(lookup, key)]) # 'uno'
Performance testing¶
from numpy import array, linspace
from numpy.random import choice, shuffle
from string import ascii_lowercase
from bisect import bisect_left
MAX_SIZE = 10**4
letters = array([c for c in ascii_lowercase])
words = choice(letters, size=(MAX_SIZE, 10))
words = array([''.join(w) for w in words])
ns = linspace(1, MAX_SIZE, 500)
list_times = []
dict_times = []
bisect_times = []
for n in ns:
shuffle(words)
sample_list = list(words[:int(n)])
sample_dict = dict.fromkeys(sample_list)
sample_bisect = sorted(sample_list)
lookup_words = choice(sample_list, size=10)
# List indexing
list_time = %timeit -q -n1 -o [sample_list.index(w) for w in lookup_words]
# Dict mechanizm
dict_time = %timeit -q -n1 -o [sample_dict[w] for w in lookup_words]
# Bisect mechanizm
bisect_time = %timeit -q -n1 -o [bisect_left(sample_bisect, w) for w in lookup_words]
list_times.append(list_time.best)
dict_times.append(dict_time.best)
bisect_times.append(bisect_time.best)
%matplotlib inline
from matplotlib.pyplot import figure, plot, title, legend, ylabel, xlabel, show
fig = figure(figsize=(12,8))
# plot(ns, list_times, 'bo', label=u'list lookup ~ O(n)') # so much slower
plot(ns, dict_times, 'go', label=u'dict lookup ~ O(1)') # the best
plot(ns, bisect_times, 'ro', label=u'bisect lookup ~ O(log(n))')
title('list vs dict vs bisect (binary-search) lookup')
legend(loc='best')
xlabel('input size (elements)', figure=fig)
ylabel('best time out of 3 (s)', figure=fig)
show()
dict vs object¶
class Foo: bar = {}
pass
foo = Foo()
foo.x = 10 bar['y'] = 10
print(foo.x) # 10 print(bar['y']) # 10
from operator import getitem
print(getattr(foo, 'x')) # "getattr protocol" => 10 print(getitem(bar, 'y')) # "getitem protocol" => 10
foo.__dict__
{'x': 10}
class Base:
z = 10
class Derived(Base):
pass
d = Derived()
print(d.__dict__) # {}
print(getattr(d, 'z')) # 10
Performance: object vs dictionary¶
class Foo:
x = 10
foo = Foo()
foo.z = 10
bar = {'y': 20}
%timeit foo.z
%timeit bar['y']
# almost the same time
# 55.3 ns +/- 1.33 ns per loop (mean +/- std. dev. of 7 runs, 10000000 loops each)
# 47.7 ns +/- 0.124 ns per loop (mean +/- std. dev. of 7 runs, 10000000 loops each)
Performance: __getitem__ vs __getattr__ vs __getattribute__¶
class Foo:
def __getitem__(self, key):
if key == 'x':
return 10
raise KeyError('no such key {}'.format(key))
def __getattr__(self, attr):
if attr == 'x':
return 100
raise AttributeError('no such attr {}'.format(attr))
foo = Foo()
%timeit foo.x
%timeit foo['x']
# __getitem__ is faster then __getattr__
# 515 ns +/- 9.13 ns per loop (mean +/- std. dev. of 7 runs, 1000000 loops each)
# 157 ns +/- 0.307 ns per loop (mean +/- std. dev. of 7 runs, 10000000 loops each)
# but if we use __getattribute__ instead of __getattr__:
class Foo:
def __getitem__(self, key):
if key == 'x':
return 10
raise KeyError('no such key {}'.format(key))
def __getattribute__(self, attr):
if attr == 'x':
return 100
raise AttributeError('no such attr {}'.format(attr))
foo = Foo()
%timeit foo.x
%timeit foo['x']
# performance is the same
# 144 ns +/- 2.58 ns per loop (mean +/- std. dev. of 7 runs, 10000000 loops each)
# 158 ns +/- 0.0948 ns per loop (mean +/- std. dev. of 7 runs, 10000000 loops each)
Semantics: objects vs dict¶
function == dictionaty
__call__ == __getitem__ protocol
__getitem__ == __getattr__ protocol
Distinctions¶
? not use __missing__ ? to avoid complexity
Other approaches:
class Foo:
def __getitem__(self, key):
return key
def __getattr__(self, attr):
return attr
foo = Foo()
# __getitem__
print(foo['two words'], foo['123'], foo['abc/def']) # ('two words', '123', 'abc/def')
# __getattr__ protocol
# foo.two words # Error - attr has space
# foo.123 # Error - attr started with numbers
# foo.abc/def # Error - attr with spec chars
=> restriction on name of the attribute in objects
Examples: obj vs dict¶
1. simple class¶
class Foo:
def __init__(self):
self.x = 10
foo = Foo()
print(foo.x, foo.__dict__['x']) # 10 10 - 2 ways: .x and dict lookup
2. lookup¶
class Base:
x = 10
class Derived(Base):
pass
d = Derived()
Base.y = 100
print(d.y) # 100
# do the same (hierarchical lookup) in dict:
from collections import ChainMap
b = {}
d = ChainMap({}, b) # chains empty dict with b
b['x'] = 10
print(d['x']) # 10
b['x'] = 100
print(d['x']) # 100
d['x'] = 500
print(d['x']) # 500 - first dict in list with element
3. multiple derived class¶
class BaseA:
x = 10
class BaseB:
x = 100
y = 200
class Derived(BaseA, BaseB):
pass
d = Derived()
print(d.x, d.y) # (10, 200)
# the same in dict:
ba = {'x': 10}
bb = {'x': 100, 'y': 200}
d = ChainMap({}, ba, bb)
print(d['x'], d['y']) # (10, 200)
4. we can costruct ChainMap from __mro__ and get the same name resolution¶
print(Derived.__mro__) # (__main__.Derived, __main__.BaseA, __main__.BaseB, object)
d = ChainMap({}, *(entry.__dict__ for entry in Derived.__mro__))
print(d['x'], d['y']) # (10, 200)
5. with instance state self.x = x¶
class Foo:
def __init__(self, x):
self.x = x
foo = Foo(10)
print(foo.x) # 10
6.¶
class Foo:
def __init__(self, x):
self._x = x
@property
def x(self):
self._x += 1
return self._x
foo = Foo(10)
print(foo.x, foo.x, foo.x) # 11 12 13
# same with dictionary (we can not hide the behaviour in dict):
class Foo:
def __init__(self, x):
self._x = x
def __getitem__(self, key):
if key == 'x':
self._x += 1
return self._x
raise KeyError('no such key {}'.format(key))
foo = Foo(10)
print(foo['x'], foo['x'], foo['x']) # 11 12 13
attribute dictionary¶
dict with attribute lookup to get contents
attribute lookup and item lookup in one object
class attrdict(dict):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.__dict__ = self # assign dict to itself
quux = attrdict()
quux['w'] = 100 # item syntax
print(quux.w) # 100
quux.w = 200 # attribute syntax
print(quux['w']) # 200
# ATTN!!! Be careful with next statement on Windows:
from itertools import count
try:
for num in count(1):
attrdict({i: i for i in range(1024*1024)})
except MemoryError:
print('Out of memory after creating {} attrdicts'.format(num))
# and perform GC immediatelly
from gc import collect; collect() # 5205
# better formulation (without memory circle)
class attrdict(dict):
__getattr__ = dict.__getitem__
__setattr__ = dict.__setitem__
__delattr__ = dict.__delitem__
try:
for num in range(1, 150 + 1):
attrdict({i: i for i in range(1024*1024)})
except MemoryError:
print('Out of memory after creating {} attrdicts'.format(num))
print('Survived creating {} attrdicts'.format(num + 1))
Resume:
It's better do not combine attribute lookup and item lookup in one object!!!